The data was downloaded from https://www.kaggle.com/datasets/uciml/mushroom-classification.
Given features of a mushroom, we want to classify it as poisonous or edible.

Prepare the workspace

# Session-wide printing options for this analysis.
options(
  digits = 2,   # Print 2 significant digits
  scipen = 999  # Large penalty so numbers print in fixed rather than scientific notation
)
# Warnings are deliberately left enabled (no `warn = -1`): hiding every warning
# globally would also hide diagnostics from the statistical tests and from
# deprecated functions used later in this script.
# The workspace is not wiped with rm(list = ls()): that call does not reset
# loaded packages or options, and it silently deletes the caller's objects.
# Run the script in a fresh R session when a clean start is needed.

1. Load the data, convert binary target variable to numeric, drop constant column

library(magrittr) # Provides the %>% pipe used throughout this script
# Read the raw mushroom data, add a numeric 0/1 target, and drop `veil_type`.
M <- readr::read_csv(
  file = "/Users/thienpham/Data Mining/data/mushrooms.csv",
  # Load every column as character so codes such as "0234" keep their leading
  # zero. A default column spec is used instead of a fixed-length "ccc..."
  # string, so the call does not depend on the exact number of columns.
  col_types = readr::cols(.default = readr::col_character()),
  name_repair = janitor::make_clean_names    # Cleans up column names if messy
) %>%
  dplyr::mutate(
    target = as.numeric(class == "e")        # New binary column: 1 when class == "e" (edible), otherwise 0
  ) %>%
  dplyr::select(-veil_type) %>%              # veil_type was dropped because it was constant (all same value = meaningless)
  as.data.frame()

2. Fill-in missing values with “N/A”

# Record the (primary) class of every column of M as a named character vector.
# vapply() guarantees a character vector; sapply(M, class) would silently
# return a list if any column carried more than one class (e.g. an ordered
# factor), which would break the comparison against "character" further down.
v_class <- vapply(
  X = M,
  FUN = function(column) class(column)[1L],
  FUN.VALUE = character(1)
)
v_class
##                    class                cap_shape              cap_surface 
##              "character"              "character"              "character" 
##                cap_color                  bruises                     odor 
##              "character"              "character"              "character" 
##          gill_attachment             gill_spacing                gill_size 
##              "character"              "character"              "character" 
##               gill_color              stalk_shape               stalk_root 
##              "character"              "character"              "character" 
## stalk_surface_above_ring stalk_surface_below_ring   stalk_color_above_ring 
##              "character"              "character"              "character" 
##   stalk_color_below_ring               veil_color              ring_number 
##              "character"              "character"              "character" 
##                ring_type        spore_print_color               population 
##              "character"              "character"              "character" 
##                  habitat                   target 
##              "character"                "numeric"
# Names of the columns whose recorded class is "character".
v_character <- names(which(v_class == "character"))
v_character
##  [1] "class"                    "cap_shape"               
##  [3] "cap_surface"              "cap_color"               
##  [5] "bruises"                  "odor"                    
##  [7] "gill_attachment"          "gill_spacing"            
##  [9] "gill_size"                "gill_color"              
## [11] "stalk_shape"              "stalk_root"              
## [13] "stalk_surface_above_ring" "stalk_surface_below_ring"
## [15] "stalk_color_above_ring"   "stalk_color_below_ring"  
## [17] "veil_color"               "ring_number"             
## [19] "ring_type"                "spore_print_color"       
## [21] "population"               "habitat"
# Every character column except the raw label `class` is a predictor.
v_character <- setdiff(v_character, "class")
# In each predictor, recode missing entries (a real NA, the literal text "NA",
# or an empty string) as the explicit level "N/A".
for (j in v_character) {
  M[[j]][is.na(M[[j]]) | M[[j]] %in% c("NA", "")] <- "N/A"
}

3. Visualize target variable

We visualize the target variable to see how many mushrooms fall into each class and how balanced the two classes are.

library(ggplot2)
# Count the rows in each class and build an "n, p%" label for each bar.
M_plot <- as.data.frame(
  dplyr::mutate(
    dplyr::count(M, class),
    percent = round(100 * n / sum(n)),
    label = paste0(n, ", ", percent, "%")
  )
)

# Bar chart of the class counts with the label centred inside each bar.
ggplot(M_plot, aes(x = class, y = n, fill = class, label = label)) +
  geom_col() +
  geom_text(position = position_stack(vjust = 0.5)) +
  labs(
    title = "Bar plot of class",
    caption = "Data source: https://www.kaggle.com/datasets/uciml/mushroom-classification"
  )

4. Convert categorical predictors to factors with levels ordered by frequency

We wouldn't normally do this for ordinal data because it disrupts the natural order, but it makes sense for nominal data.

# Re-encode every predictor as a factor whose levels run from the most to the
# least frequent value.
M[v_character] <- lapply(
  X = M[v_character],
  FUN = forcats::fct_infreq
)

5. Sort categorical predictors by Chi-squared test p-value

This uses the Chi-squared test to establish an order of our predictors from most informative to least informative.

# For each predictor, cross-tabulate it against `class` and run a chi-squared
# test of independence on that table, then order the predictors from the
# smallest to the largest p-value.
# The ordering uses the natural log of the p-value (upper tail of the
# chi-squared distribution at the test statistic) rather than the p-value
# itself: for strongly associated predictors the plain p-value underflows to
# exactly 0, so those predictors would all tie and keep their column order
# instead of being ranked. Sorting on the log scale gives the same order
# wherever the p-values are distinguishable and breaks those ties properly.
# NOTE: printed orderings produced before this change need a re-knit.
v_character <- names(sort(vapply(
  X = v_character,
  FUN = function(j) {
    test <- chisq.test(
      x = table(M[, c("class", j)])
    )
    unname(stats::pchisq(
      q = test$statistic,
      df = test$parameter,
      lower.tail = FALSE,
      log.p = TRUE
    ))
  },
  FUN.VALUE = numeric(1)
)))
v_character
##  [1] "bruises"                  "odor"                    
##  [3] "gill_size"                "gill_color"              
##  [5] "stalk_surface_above_ring" "stalk_surface_below_ring"
##  [7] "stalk_color_above_ring"   "stalk_color_below_ring"  
##  [9] "ring_type"                "spore_print_color"       
## [11] "population"               "habitat"                 
## [13] "stalk_root"               "gill_spacing"            
## [15] "cap_shape"                "ring_number"             
## [17] "cap_color"                "cap_surface"             
## [19] "veil_color"               "gill_attachment"         
## [21] "stalk_shape"
# Example of the contingency table that is passed to the chi-squared test,
# shown here for the predictor `bruises`.
table(class = M[["class"]], bruises = M[["bruises"]])
##      bruises
## class    f    t
##     e 1456 2752
##     p 3292  624

6. Visualize categorical predictors with binary target variable

These visualizations help us see which levels of each predictor separate edible from poisonous mushrooms.

Dr. Smith suppressed a lot of warning messages from this chunk.

# Bubble colours, keyed by the percentage band that a class takes up within
# one level of a predictor.
v_color <- c(
  "(0,33]" = "red",
  "(33,67]" = "forestgreen",
  "(67,100]" = "skyblue"
)

# One bubble plot per predictor: x = predictor level, y = class, bubble size =
# row count, bubble colour = share of that class within the predictor level.
for (j in v_character) {
  M_plot <- M %>%
    # Count rows for every observed (class, level) combination. Column names
    # held in strings are selected with across(all_of()) instead of the
    # deprecated underscore verbs select_() / group_by_().
    dplyr::group_by(dplyr::across(dplyr::all_of(c("class", j)))) %>%
    dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
    # Percentages are computed within each level of the predictor.
    dplyr::group_by(dplyr::across(dplyr::all_of(j))) %>%
    dplyr::mutate(
      percent = round(100 * n / sum(n)),
      label = paste0(n, "\n", percent, "%"),
      # include.lowest keeps a rounded 0% inside the first band instead of
      # turning it into NA (which would draw an uncoloured bubble); explicit
      # labels keep the band names identical to names(v_color).
      percent = cut(
        percent,
        breaks = c(0, 33, 67, 100),
        labels = names(v_color),
        include.lowest = TRUE
      )
    ) %>%
    dplyr::ungroup()

  p <- ggplot(M_plot) +
    # .data[[j]] replaces the deprecated aes_string(x = j).
    aes(x = .data[[j]], y = class, label = label) +
    geom_point(aes(size = n, color = percent)) +
    geom_text() +
    scale_size_area(max_size = 20) +
    scale_color_manual(values = v_color) +
    theme_bw() +
    labs(
      x = j, # keep the predictor name as the axis title
      title = paste0("Bubble plot of class by ", j),
      subtitle = "Large red and blue bubbles are better for predictor variables.",
      caption = paste0(
        "Data source: https://www.kaggle.com/datasets/uciml/mushroom-classification",
        "\n",
        "Chi-squared test p-value: ", round(chisq.test(table(M[, c("class", j)]))$p.value, 22)
      )
    )
  print(p)
}

########################################## !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# For the first plot (bruises), a mushroom is mostly edible if it has bruises and mostly poisonous if it has no bruises,
# so this predictor separates the classes well on its own. As we progress down the predictor variables toward higher
# chi-squared p-values (less significant), we see more bubbles in the green (middle) percentage range, indicating that
# the results are not as meaningful (we can't classify right away without more extensive testing).

7. Use Cramer’s V for association analysis between categorical predictors

During feature selection we will use our association analysis to identify which predictors to keep and which to drop.
* The bigger and darker the circle, the stronger the association.

# Matrix of pairwise Cramer's V values between all predictors.
M_CramerV <- DescTools::PairApply(M[, v_character], FUN = DescTools::CramerV)
# Draw the association matrix (not a correlation matrix, hence is.corr = FALSE),
# without the diagonal, with variables ordered by Ward hierarchical clustering.
corrplot::corrplot(
  M_CramerV,
  is.corr = FALSE,
  diag = FALSE,
  hclust.method = "ward.D",
  order = "hclust"
)

8. Use Cramer’s V to perform variable clustering

This will produce a hierarchical clustering based on the association between categorical variables. If two variables are highly associated, then keeping both will result in collinearity problems. The goal, for feature selection purposes, is to identify whether, if I remove a predictor variable, another one will pick up the slack.

# Dendrogram of the predictors. The distance between two predictors is
# 1 - Cramer's V (taken from M_CramerV), so strongly associated predictors
# are close together; clusters are merged with the "ward.D" method.
plot(
  x = hclust(
    d = as.dist(
      m = 1 - M_CramerV
      ),
    method = "ward.D"          # ward's D will give us more even subgroups
  )
)

# If we remove bruises, ring_type could still provide us with similar information. This reduces the number of predictor variables needed while still giving us as much information.

# Same dendrogram on the 1 - Cramer's V distance (from M_CramerV), but
# clusters are merged with the "single" linkage method instead of "ward.D".
plot(
  x = hclust(
    d = as.dist(
      m = 1 - M_CramerV
    ),
    method = "single"          # single linkage does a better job at finding outliers 
  )
)

# With single linkage we can see that cap_surface and cap_shape are very different from all of the other predictor variables therefore we would not want to remove these variables since they contain unique information. 

9. Prepare the categorical and ordinal columns

To make model fitting faster and to prevent over-fitting, categorical and ordinal columns will be binned into two levels and represented with a binary column. The binning strategy aims for a 50/50 split between the two groups.

Bin nominal levels into binary

two strategies

  • group by conditional mean
  • group low frequency levels into “Other”

After binning into 2 groups, the first group will be all 0’s in the binary predictor variable, and the second group all 1’s.

Which variables would use which method was decided beforehand. Normally you would apply one method to all predictor variables and, if there weren't any clear binning groups, then run the other method on the remaining variables.

# Predictors binned by the conditional mean of the target.
v_mean <- c(
  "cap_color",
  "bruises",
  "gill_color",
  "ring_type",
  "spore_print_color",
  "population",
  "cap_shape",
  "odor",
  "stalk_shape",
  "stalk_color_below_ring",
  "stalk_color_above_ring"
)
# Predictors binned as "most frequent level" versus everything else.
v_other <- c(
  "veil_color",
  "gill_attachment",
  "ring_number",
  "gill_spacing",
  "habitat",
  "cap_surface",
  "gill_size",
  "stalk_root",
  "stalk_surface_above_ring",
  "stalk_surface_below_ring"
)

A. group low frequency levels into “Other”

The general rule used for the “Other” method is a split of roughly 1/3 and 2/3 when there is no clear distinction, e.g. a=.39 b=.23 c=.19 d=.12 e=.07: bin a into one group and everything else into “Other”.

# For every predictor in v_other: replace the column with a 0/1 indicator of
# its most frequent level, then print the level counts and proportions that
# the split was based on.
for (j in v_other) {
  # Level counts, most frequent first, stripped of the "table" class.
  v_table <- unclass(sort(table(x = M[[j]]), decreasing = TRUE))
  # 1 = row has the most frequent level, 0 = any other level.
  M[[j]] <- as.numeric(M[[j]] == names(v_table)[1])
  M_class <- data.frame(
    level = names(v_table),
    n = v_table,
    proportion = prop.table(v_table)
  )
  print("---------------------------------------------------------------------")
  print(j)
  print(M_class)
}
## [1] "---------------------------------------------------------------------"
## [1] "veil_color"
##   level    n proportion
## w     w 7924    0.97538
## n     n   96    0.01182
## o     o   96    0.01182
## y     y    8    0.00098
## [1] "---------------------------------------------------------------------"
## [1] "gill_attachment"
##   level    n proportion
## f     f 7914      0.974
## a     a  210      0.026
## [1] "---------------------------------------------------------------------"
## [1] "ring_number"
##   level    n proportion
## o     o 7488     0.9217
## t     t  600     0.0739
## n     n   36     0.0044
## [1] "---------------------------------------------------------------------"
## [1] "gill_spacing"
##   level    n proportion
## c     c 6812       0.84
## w     w 1312       0.16
## [1] "---------------------------------------------------------------------"
## [1] "habitat"
##   level    n proportion
## d     d 3148      0.387
## g     g 2148      0.264
## p     p 1144      0.141
## l     l  832      0.102
## u     u  368      0.045
## m     m  292      0.036
## w     w  192      0.024
## [1] "---------------------------------------------------------------------"
## [1] "cap_surface"
##   level    n proportion
## y     y 3244    0.39931
## s     s 2556    0.31462
## f     f 2320    0.28557
## g     g    4    0.00049
## [1] "---------------------------------------------------------------------"
## [1] "gill_size"
##   level    n proportion
## b     b 5612       0.69
## n     n 2512       0.31
## [1] "---------------------------------------------------------------------"
## [1] "stalk_root"
##   level    n proportion
## b     b 3776      0.465
## ?     ? 2480      0.305
## e     e 1120      0.138
## c     c  556      0.068
## r     r  192      0.024
## [1] "---------------------------------------------------------------------"
## [1] "stalk_surface_above_ring"
##   level    n proportion
## s     s 5176      0.637
## k     k 2372      0.292
## f     f  552      0.068
## y     y   24      0.003
## [1] "---------------------------------------------------------------------"
## [1] "stalk_surface_below_ring"
##   level    n proportion
## s     s 4936      0.608
## k     k 2304      0.284
## f     f  600      0.074
## y     y  284      0.035

B. group by conditional mean

# For every predictor in v_mean: order its levels by the mean of `target`
# within the level, place each level at the midpoint of its slice of the
# cumulative row proportion, and recode the column to 1 for levels whose
# midpoint is >= 0.5 and 0 otherwise. The per-level table is printed.
#
# Changes from the previous version (binning results are unchanged):
#   * the deprecated select_() / group_by_() are replaced by all_of()/across();
#   * the mutate_() step is removed - it did not convert the predictor to
#     character, it only added a throwaway column named `j` that summarise()
#     dropped again;
#   * the unused `v_j` (which indexed factor levels with a row mask in a
#     different order) is removed.
for (j in v_mean) {
  M_target <- M %>%
    dplyr::select(dplyr::all_of(c(j, "target"))) %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(j))) %>%
    dplyr::summarise(target = mean(target, na.rm = TRUE), n = dplyr::n()) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(proportion = n / sum(n)) %>%
    # Cumulative proportion up to and including each level (ascending target).
    dplyr::arrange(target) %>%
    dplyr::mutate(cumsum_ascending = cumsum(proportion)) %>%
    # Proportion strictly below each level: 1 - (this level and all above it).
    dplyr::arrange(dplyr::desc(cumsum_ascending)) %>%
    dplyr::mutate(cumsum_descending = 1 - cumsum(proportion)) %>%
    dplyr::arrange(cumsum_ascending) %>%
    # Midpoint of the level's slice of the cumulative distribution.
    dplyr::mutate(mean_cumsum = (cumsum_ascending + cumsum_descending) / 2) %>%
    dplyr::arrange(mean_cumsum) %>%
    as.data.frame()
  # 1 = level sits in the upper half (midpoint >= 0.5), 0 = lower half.
  M[, j] <- as.numeric(
    x = M[, j] %in% M_target[M_target$mean_cumsum >= 0.5, j]
  )
  print(
    x = "---------------------------------------------------------------------"
  )
  print(
    x = j
  )
  print(knitr::kable(
    M_target
  ))
}
## [1] "---------------------------------------------------------------------"
## [1] "cap_color"
## 
## 
## |cap_color | target|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:---------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |b         |   0.29|  168|       0.02|             0.02|              0.00|        0.01|
## |y         |   0.37| 1072|       0.13|             0.15|              0.02|        0.09|
## |p         |   0.39|  144|       0.02|             0.17|              0.15|        0.16|
## |e         |   0.42| 1500|       0.18|             0.35|              0.17|        0.26|
## |n         |   0.55| 2284|       0.28|             0.64|              0.35|        0.50|
## |g         |   0.56| 1840|       0.23|             0.86|              0.64|        0.75|
## |w         |   0.69| 1040|       0.13|             0.99|              0.86|        0.93|
## |c         |   0.73|   44|       0.01|             1.00|              0.99|        0.99|
## |r         |   1.00|   16|       0.00|             1.00|              1.00|        1.00|
## |u         |   1.00|   16|       0.00|             1.00|              1.00|        1.00|
## [1] "---------------------------------------------------------------------"
## [1] "bruises"
## 
## 
## |bruises | target|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |f       |   0.31| 4748|       0.58|             0.58|              0.00|        0.29|
## |t       |   0.82| 3376|       0.42|             1.00|              0.58|        0.79|
## [1] "---------------------------------------------------------------------"
## [1] "gill_color"
## 
## 
## |gill_color | target|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |b          |   0.00| 1728|       0.21|             0.21|              0.00|        0.11|
## |r          |   0.00|   24|       0.00|             0.22|              0.21|        0.21|
## |h          |   0.28|  732|       0.09|             0.31|              0.22|        0.26|
## |g          |   0.33|  752|       0.09|             0.40|              0.31|        0.35|
## |p          |   0.57| 1492|       0.18|             0.58|              0.40|        0.49|
## |y          |   0.74|   86|       0.01|             0.59|              0.58|        0.59|
## |w          |   0.80| 1202|       0.15|             0.74|              0.59|        0.67|
## |k          |   0.84|  408|       0.05|             0.79|              0.74|        0.77|
## |n          |   0.89| 1048|       0.13|             0.92|              0.79|        0.86|
## |u          |   0.90|  492|       0.06|             0.98|              0.92|        0.95|
## |e          |   1.00|   96|       0.01|             0.99|              0.98|        0.99|
## |o          |   1.00|   64|       0.01|             1.00|              0.99|        1.00|
## [1] "---------------------------------------------------------------------"
## [1] "ring_type"
## 
## 
## |ring_type | target|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:---------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |l         |   0.00| 1296|       0.16|             0.16|              0.00|        0.08|
## |n         |   0.00|   36|       0.00|             0.16|              0.16|        0.16|
## |e         |   0.36| 2776|       0.34|             0.51|              0.16|        0.33|
## |p         |   0.79| 3968|       0.49|             0.99|              0.51|        0.75|
## |f         |   1.00|   48|       0.01|             1.00|              0.99|        1.00|
## [1] "---------------------------------------------------------------------"
## [1] "spore_print_color"
## 
## 
## |spore_print_color | target|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |r                 |   0.00|   72|       0.01|             0.01|              0.00|        0.00|
## |h                 |   0.03| 1632|       0.20|             0.21|              0.01|        0.11|
## |w                 |   0.24| 2388|       0.29|             0.50|              0.21|        0.36|
## |k                 |   0.88| 1872|       0.23|             0.73|              0.50|        0.62|
## |n                 |   0.89| 1968|       0.24|             0.98|              0.73|        0.86|
## |b                 |   1.00|   48|       0.01|             0.98|              0.98|        0.98|
## |o                 |   1.00|   48|       0.01|             0.99|              0.98|        0.99|
## |u                 |   1.00|   48|       0.01|             0.99|              0.99|        0.99|
## |y                 |   1.00|   48|       0.01|             1.00|              0.99|        1.00|
## [1] "---------------------------------------------------------------------"
## [1] "population"
## 
## 
## |population | target|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |v          |   0.30| 4040|       0.50|             0.50|              0.00|        0.25|
## |y          |   0.62| 1712|       0.21|             0.71|              0.50|        0.60|
## |s          |   0.71| 1248|       0.15|             0.86|              0.71|        0.78|
## |c          |   0.85|  340|       0.04|             0.90|              0.86|        0.88|
## |n          |   1.00|  400|       0.05|             0.95|              0.90|        0.93|
## |a          |   1.00|  384|       0.05|             1.00|              0.95|        0.98|
## [1] "---------------------------------------------------------------------"
## [1] "cap_shape"
## 
## 
## |cap_shape | target|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:---------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |c         |   0.00|    4|       0.00|             0.00|              0.00|        0.00|
## |k         |   0.28|  828|       0.10|             0.10|              0.00|        0.05|
## |f         |   0.51| 3152|       0.39|             0.49|              0.10|        0.30|
## |x         |   0.53| 3656|       0.45|             0.94|              0.49|        0.72|
## |b         |   0.89|  452|       0.06|             1.00|              0.94|        0.97|
## |s         |   1.00|   32|       0.00|             1.00|              1.00|        1.00|
## [1] "---------------------------------------------------------------------"
## [1] "odor"
## 
## 
## |odor | target|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |f    |   0.00| 2160|       0.27|             0.27|              0.00|        0.13|
## |s    |   0.00|  576|       0.07|             0.34|              0.27|        0.30|
## |y    |   0.00|  576|       0.07|             0.41|              0.34|        0.37|
## |p    |   0.00|  256|       0.03|             0.44|              0.41|        0.42|
## |c    |   0.00|  192|       0.02|             0.46|              0.44|        0.45|
## |m    |   0.00|   36|       0.00|             0.47|              0.46|        0.47|
## |n    |   0.97| 3528|       0.43|             0.90|              0.47|        0.68|
## |a    |   1.00|  400|       0.05|             0.95|              0.90|        0.93|
## |l    |   1.00|  400|       0.05|             1.00|              0.95|        0.98|
## [1] "---------------------------------------------------------------------"
## [1] "stalk_shape"
## 
## 
## |stalk_shape | target|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:-----------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |e           |   0.46| 3516|       0.43|             0.43|              0.00|        0.22|
## |t           |   0.56| 4608|       0.57|             1.00|              0.43|        0.72|
## [1] "---------------------------------------------------------------------"
## [1] "stalk_color_below_ring"
## 
## 
## |stalk_color_below_ring | target|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------------------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |b                      |   0.00|  432|       0.05|             0.05|              0.00|        0.03|
## |c                      |   0.00|   36|       0.00|             0.06|              0.05|        0.06|
## |y                      |   0.00|   24|       0.00|             0.06|              0.06|        0.06|
## |n                      |   0.12|  512|       0.06|             0.12|              0.06|        0.09|
## |p                      |   0.31| 1872|       0.23|             0.35|              0.12|        0.24|
## |w                      |   0.62| 4384|       0.54|             0.89|              0.35|        0.62|
## |g                      |   1.00|  576|       0.07|             0.96|              0.89|        0.93|
## |o                      |   1.00|  192|       0.02|             0.99|              0.96|        0.98|
## |e                      |   1.00|   96|       0.01|             1.00|              0.99|        0.99|
## [1] "---------------------------------------------------------------------"
## [1] "stalk_color_above_ring"
## 
## 
## |stalk_color_above_ring | target|    n| proportion| cumsum_ascending| cumsum_descending| mean_cumsum|
## |:----------------------|------:|----:|----------:|----------------:|-----------------:|-----------:|
## |b                      |   0.00|  432|       0.05|             0.05|              0.00|        0.03|
## |c                      |   0.00|   36|       0.00|             0.06|              0.05|        0.06|
## |y                      |   0.00|    8|       0.00|             0.06|              0.06|        0.06|
## |n                      |   0.04|  448|       0.06|             0.11|              0.06|        0.09|
## |p                      |   0.31| 1872|       0.23|             0.34|              0.11|        0.23|
## |w                      |   0.62| 4464|       0.55|             0.89|              0.34|        0.62|
## |g                      |   1.00|  576|       0.07|             0.96|              0.89|        0.93|
## |o                      |   1.00|  192|       0.02|             0.99|              0.96|        0.98|
## |e                      |   1.00|   96|       0.01|             1.00|              0.99|        0.99|

10. Check results

# Column-wise summary of the prepared data.
summary(M)
##     class             cap_shape     cap_surface    cap_color       bruises    
##  Length:8124        Min.   :0.00   Min.   :0.0   Min.   :0.00   Min.   :0.00  
##  Class :character   1st Qu.:0.00   1st Qu.:0.0   1st Qu.:0.00   1st Qu.:0.00  
##  Mode  :character   Median :1.00   Median :0.0   Median :0.00   Median :0.00  
##                     Mean   :0.51   Mean   :0.4   Mean   :0.36   Mean   :0.42  
##                     3rd Qu.:1.00   3rd Qu.:1.0   3rd Qu.:1.00   3rd Qu.:1.00  
##                     Max.   :1.00   Max.   :1.0   Max.   :1.00   Max.   :1.00  
##       odor      gill_attachment  gill_spacing    gill_size      gill_color  
##  Min.   :0.00   Min.   :0.00    Min.   :0.00   Min.   :0.00   Min.   :0.00  
##  1st Qu.:0.00   1st Qu.:1.00    1st Qu.:1.00   1st Qu.:0.00   1st Qu.:0.00  
##  Median :1.00   Median :1.00    Median :1.00   Median :1.00   Median :0.00  
##  Mean   :0.53   Mean   :0.97    Mean   :0.84   Mean   :0.69   Mean   :0.42  
##  3rd Qu.:1.00   3rd Qu.:1.00    3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.00  
##  Max.   :1.00   Max.   :1.00    Max.   :1.00   Max.   :1.00   Max.   :1.00  
##   stalk_shape     stalk_root   stalk_surface_above_ring
##  Min.   :0.00   Min.   :0.00   Min.   :0.00            
##  1st Qu.:0.00   1st Qu.:0.00   1st Qu.:0.00            
##  Median :1.00   Median :0.00   Median :1.00            
##  Mean   :0.57   Mean   :0.46   Mean   :0.64            
##  3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.00            
##  Max.   :1.00   Max.   :1.00   Max.   :1.00            
##  stalk_surface_below_ring stalk_color_above_ring stalk_color_below_ring
##  Min.   :0.00             Min.   :0.00           Min.   :0.00          
##  1st Qu.:0.00             1st Qu.:0.00           1st Qu.:0.00          
##  Median :1.00             Median :1.00           Median :1.00          
##  Mean   :0.61             Mean   :0.66           Mean   :0.65          
##  3rd Qu.:1.00             3rd Qu.:1.00           3rd Qu.:1.00          
##  Max.   :1.00             Max.   :1.00           Max.   :1.00          
##    veil_color    ring_number     ring_type    spore_print_color   population 
##  Min.   :0.00   Min.   :0.00   Min.   :0.00   Min.   :0.0       Min.   :0.0  
##  1st Qu.:1.00   1st Qu.:1.00   1st Qu.:0.00   1st Qu.:0.0       1st Qu.:0.0  
##  Median :1.00   Median :1.00   Median :0.00   Median :0.0       Median :1.0  
##  Mean   :0.98   Mean   :0.92   Mean   :0.49   Mean   :0.5       Mean   :0.5  
##  3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.00   3rd Qu.:1.0       3rd Qu.:1.0  
##  Max.   :1.00   Max.   :1.00   Max.   :1.00   Max.   :1.0       Max.   :1.0  
##     habitat         target    
##  Min.   :0.00   Min.   :0.00  
##  1st Qu.:0.00   1st Qu.:0.00  
##  Median :0.00   Median :1.00  
##  Mean   :0.39   Mean   :0.52  
##  3rd Qu.:1.00   3rd Qu.:1.00  
##  Max.   :1.00   Max.   :1.00

Now the next step is feature selection and then model building.

Save prepared data

In the summary above, we want to see a minimum of 0, a maximum of 1, and a mean between 1/3 and 2/3 for each binary predictor. If a mean is outside that range, that is OK, because there's only so much you can do once you've tried everything.

# Save the prepared data as CSV, without a row-name column.
write.csv(M, file = "/Users/thienpham/Data Mining/data/prepared_mushrooms.csv", row.names = FALSE)

Homework

Take your data for classification supervised learning and prepare your predictors and your target variable. You do not need to perform feature selection yet, we will do that in an upcoming assignment.